SURVDAT pull comparison

Digging into differences in the fall Bigelow survey data across different pulls of survdat

Objective is to isolate which main columns are inconsistent across the datasets and whether the problem is isolated to specific species.

For convenience we also limited the data to include years 2000-present.

Load SURVDAT Sources

SURVDAT Pulls

For the following report we looked at 5 candidate survdat datasets.

  1. A 2016 survdat dataset
  2. A 2019 survdat dataset
  3. An August 2020 survdat dataset
  4. A second 2020 survdat dataset obtained from Janet Nye to double check our 2020
  5. The most recent 2021 pull from Sean
# 2016
load(paste0(mills_path, "Projects/WARMEM/Old survey data/Survdat_Nye2016.RData"))
survdat_16 <- clean_names(survdat) %>% filter(est_year >= 2000)


# 2019
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_allseason.RData"))
survdat_19 <- clean_names(survdat) %>% filter(est_year >= 2000)

# 2020
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_Aug 2020.RData"))
survdat_20 <- clean_names(survdat) %>% filter(est_year >=  2000)

# Double check of 2020 from Janet on 01282021
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_Aug 2020_check01282021.RData"))
survdat_20_check <- clean_names(survdat) %>% filter(est_year >=  2000)

# 2021
load(paste0(res_path, "NMFS_trawl/survdat_slucey_01152021.RData"))
survdat_21 <- clean_names(survdat) %>% filter(year >= 2000)

# remove survdat
rm(survdat)

Run Cleanup Function

The cleanup function does a number of steps to each survdat source. This includes the removal of specific strata, the removal of non-spring/fall seasons, removal of vessels other than the Albatross or Bigelow, the dropping of NA biomass and abundance events, and the removal of certain species like shrimp.

It also matches up the stations with larger regions they fall within and the survey effort within each of those. This should not have an impact on any of the things we are tracking below.

The code for the cleanup function can be found here

# Cleanup functions
source(here("R/01_nefsc_ss_build.R"))

# Clean each up using the survdat prep function
trawldat_16 <- survdat_prep(survdat = survdat_16) %>% 
  mutate(survdat_source = "2016")
trawldat_19 <- survdat_prep(survdat = survdat_19) %>% 
  mutate(survdat_source = "2019")
trawldat_20 <- survdat_prep(survdat = survdat_20) %>% 
  mutate(survdat_source = "2020")
trawldat_20_check <- survdat_prep(survdat = survdat_20_check) %>% 
  mutate(survdat_source = "2020_check")
trawldat_21 <- survdat_prep(survdat = survdat_21) %>% 
  mutate(survdat_source = "2021")




rm(survdat_16, survdat_19, survdat_20, survdat_20_check, survdat_21)

ABUND/sum(NUMLEN) Comparisons

The ratio of survdat$abundance and sum(survdat$numlen) represents the scale of mismatch between the abundance of a species caught at in a tow when measured using the survdat$abundance column compared to the sum of survdat$numlen across all the lengths caught.

In theory they should be equal. Numbers greater than 1 indicate that the abundance column was greater than the sum of individuals at the different lengths.

# This is the same code used to make numlen_adj in the build:
get_convers <- function(trawldat){
  abundance_check <- trawldat %>%
    group_by(id, comname, catchsex, abundance) %>%
    summarise(
      numlen_abund = sum(numlen),                
      n_len_class  = n_distinct(length), # number of distinct lengths caught that station
      .groups      = "keep") %>% 
    ungroup()
  
  # Conversion factor translates each size at length to what it would need be to be
  # for them to add up to that "abundance" column
  conv_factor <- trawldat %>% 
    distinct(id, comname, catchsex, length, abundance) %>% 
    inner_join(abundance_check, by = c("id", "comname", "catchsex", "abundance")) %>% 
    mutate(convers = abundance / numlen_abund)
    
  
  
  # Merge back and convert the numlen field
  trawldat_adjusted <- trawldat %>%
    left_join(conv_factor, by = c("id", "comname", "catchsex", "length", "abundance", "n_len_class")) %>%
    mutate(numlen_adj = numlen * convers, .after = numlen) 
  
  return(trawldat_adjusted)
}



# Put all the groups together to compare abundance ratios
trawldat_list <- list(
    "2016" = get_convers(trawldat_16),
    "2019" = get_convers(trawldat_19),
    "2020" = get_convers(trawldat_20),
    "2020 re-check" = get_convers(trawldat_20_check),
    "2021" = get_convers(trawldat_21)
    )

trawldat_combined <- bind_rows(
  trawldat_list, 
  .id = "survdat_source"
)

All Species

trawldat_combined %>% 
  mutate(season = fct_rev(season)) %>% 
  ggplot(aes(y = convers, 
             x  = survdat_source, 
             color = survdat_source)) +
  geom_boxplot() +
  facet_grid(season ~ svvessel) + 
  scale_color_gmri() +
  labs(subtitle = "numlen to abundance ratio - All Species",
       y = "(survdat$abundance / sum(numlen))",
       x = "",
       caption = "Ratios calculated at each station for each species & sex.\nMay reflect change in catch composition.")

Haddock Only

trawldat_combined %>% 
  filter(comname == "haddock") %>% 
  mutate(season = fct_rev(season)) %>% 
  ggplot(aes(y = convers, 
             x  = survdat_source, 
             color = survdat_source)) +
  geom_boxplot() +
  facet_grid(season ~ svvessel) + 
  scale_color_gmri() +
  labs(subtitle = "numlen to abundance ratio - Haddock only",
       y = "(survdat$abundance / sum(numlen))",
       x = "",
       caption = "Ratio value calculated at each station/sex for haddock.")

Single Species Checks

The abund/numlen ratio is helpful for identifying consistency in behavior across the different pulls, but doesn’t really explain the how/why of the differences among them.

To get at the differences that contribute to them we use four commonly caught species to compare different components of the data sets.

For each of the following species we are comparing 4 main things across the different survdat sources:

  1. The number of unique station id’s each year
  2. The total number of fish, as tallied from the survdat$numlen column
  3. The total abundance of fish as the sum of survdat$abundance from distinct stations and the species caught at them
  4. The total biomass for each year as the sum of survdat$biomass from distinct stations and the species caught at them

This should tell us whether some datasets are missing stations completely, and whether the differences exist in the numlen, abundance, or biomass columns, and for what time periods.

For each of the following plots the drawing order of the lines is top to bottom as they appear in the key. If lines do not appear they likely are prefectly masked by some of the more recent pulls, which is good.

Differences in datasets are more apparent in the biomass and abundance totals during the Bigelow survey years.

NOTE: The Below code chunk details the functions used to isolate a given species and pull its information for display. Subsequent tabs will detail the differences among the different survdat pulls.

# Function to pull single species data
species_select <- function(species_choice){
  
  # Use list of sources to pull select species from each
  source_list <- trawldat_list
  
  # Use map to not repeat so much code
  select_species <- map_dfr(source_list, function(surv_source){
    surv_source %>% 
      select(survdat_source, id, est_year, season, comname, length, 
             numlen, abundance, biom_adj) %>% 
      filter(comname == species_choice)
  })
  
  return(select_species)
}




####  Plotting Numlen Totals and Station counts  ####
plot_stations <- function(combined_data, species_choice){
  
  # Total numlen fish summary
  species_numlens <- combined_data %>% 
    group_by(survdat_source, est_year, season) %>% 
    summarise(n_stations = n_distinct(id),
              sum_numlen = sum(numlen, na.rm = T),
              .groups = "keep")  %>% ungroup() %>% 
    mutate(season = fct_rev(season))
  
  
  # Plot for station presence
  station_plot <- ggplot(species_numlens, 
                         aes(est_year, n_stations, color = survdat_source)) +
    geom_line() +
    geom_jitter(aes(group = survdat_source, color = survdat_source, 
                  shape = survdat_source), size = 1.25, 
              width = 0.15) +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "Total Stations of Species Presence")
  
  
  # Plot for numlen
  numlen_plot <- ggplot(species_numlens, 
                        aes(est_year, sum_numlen, color = survdat_source)) +
    geom_line() +
    geom_jitter(aes(group = survdat_source, color = survdat_source, 
                  shape = survdat_source), size = 1.25, 
              width = 0.15) +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "Total Abundance using survdat$numlen", 
         caption = paste0(species_choice, " only"))
  
  plot_out <- station_plot / numlen_plot
  return(plot_out)
}





####  Plotting Biomass and Abundances  ####
plot_biomass <- function(combined_data, species_choice){
  
  # Abundance and Biomass Summary
  species_abund_bio <- combined_data %>% 
    distinct(survdat_source, est_year, season, id, abundance, biom_adj) %>% 
    group_by(survdat_source, est_year, season) %>% 
    summarise(n_stations = n_distinct(id),
              abundance = sum(abundance, na.rm = T),
              biomass = sum(biom_adj, na.rm = T),
              .groups = "keep") %>%  
    ungroup() %>% 
    mutate(season = fct_rev(season))
  
  
  # Plot for abundance
  abund_plot <- ggplot(species_abund_bio, 
                       aes(est_year, abundance, color = survdat_source)) +
    geom_line() +
    geom_jitter(aes(group = survdat_source, color = survdat_source, 
                    shape = survdat_source), size = 1.25, 
              width = 0.15) +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "Total survdat$abundance")
  
  # plot for biomass
  bio_plot <- ggplot(species_abund_bio, aes(est_year, biomass, color = survdat_source)) +
    geom_line() +
    geom_jitter(aes(group = survdat_source, color = survdat_source, 
                    shape = survdat_source), size = 1.25, 
              width = 0.15) +
    facet_wrap(~season) +
    scale_color_gmri() +
    labs(x = "", y = "Total survdat$biomass", 
         caption = paste0(species_choice, " only"))
  
  plot_out <- abund_plot / bio_plot
  return(plot_out)
  
}

NOTE: The following plots have had their points jittered to expose situations of overlap.

Haddock

Stations and Numlen Totals

species_choice <- "haddock"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Spiny dogfish

Stations and Numlen Totals

species_choice <- "spiny dogfish"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Atlantic Cod

Stations and Numlen Totals

species_choice <- "atlantic cod"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Atlantic Herring

Stations and Numlen Totals

species_choice <- "atlantic herring"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)

Abundance and Biomass

plot_biomass(single_species, species_choice)

Dataset Contents

This section will detail how each data set compares along a handful of common metrics. How many stations there are, how much biomass there is, which strata are present etc.

These should all be about the same, but may point us in the direction of what might be causing these differences.

trawldat_combined %>% 
  rename(`Survdat Source` = survdat_source) %>% 
  group_by(`Survdat Source`) %>% 
  summarise(
    `n Years` = n_distinct(est_year),
    `n Stations` = n_distinct(id),
    `n Strata` = n_distinct(stratum),
    `n Species` = n_distinct(comname)) %>% 
  kable() %>% 
  kable_styling()
Survdat Source n Years n Stations n Strata n Species
2016 16 7244 53 323
2019 19 8197 53 327
2020 20 8920 53 333
2020 re-check 20 8920 53 333
2021 20 8888 53 325

Summary

Based on the visualizations above there appears to be some inconsistency in the different datasets centered around data collected after 2008 on the RV Henry Bigelow.

These datasets seem to fall into 3 distinct groups when looking at abundance:

  1. Pre-2020

  2. 2020 Survdat Nye Data

  3. 2021 Survdat Pull

And two different groups when looking at biomass:

  1. 2021 Survdat Pull

  2. Everything else

species_choice <- "atlantic cod"
single_species <- species_select(species_choice) 


# Label the groups that we identified as behaving similarly:
cod_sources <- single_species %>% 
  mutate(`survdat group` = case_when(
    survdat_source %in% c("2016", "2019") ~ "Pre-2020 Data",
    str_detect(survdat_source, "2020") ~ "2020 Data",
    survdat_source == "2021" ~ "Sean's 2021 Data"),
    `survdat group` = factor(`survdat group`, 
                             levels = c("Pre-2020 Data", "2020 Data", 
                                        "Sean's 2021 Data"))) %>% 
  distinct(`survdat group`, survdat_source, est_year, season, id, abundance, biom_adj) %>% 
  group_by(`survdat group`, survdat_source, est_year, season) %>% 
  summarise(n_stations = n_distinct(id),
            abundance = sum(abundance, na.rm = T),
            biomass = sum(biom_adj, na.rm = T),
            .groups = "keep") %>%  
    ungroup() %>% 
    mutate(season = fct_rev(season))
  

#ellipse marker
mark1 <- cod_sources %>% 
  filter(season == "FALL", survdat_source == "2021", est_year == 2009) 

Abundance

# Abundance
cod_sources %>% 
  filter(season == "FALL") %>% 
  ggplot(aes(est_year, abundance)) +
  geom_line(aes(group = survdat_source, color = `survdat group`), alpha = 0.3) +
  geom_jitter(aes(group = survdat_source, color = `survdat group`, 
                  shape = survdat_source), size = 1.25, 
              width = 0.15) +
  geom_mark_ellipse(data = mark1, 
                    aes(est_year, abundance, label = "Separation of the 3 groups"), 
                    linetype = 2) +
  scale_color_gmri() +
  facet_wrap(~season) +
  facet_zoom(xlim = c(2008:2011)) +
  labs(x = "", 
       y = "Total survdat$abundance", 
       title = "Data Mismatches Following Vessel Switch", 
       subtitle = "Inconsistency in Bigelow data over time.", 
       caption = paste0(species_choice, " only"),
       shape = "Pull Source",
       color = "Consistent Grouping") 

Biomass

# Biomass
cod_sources %>% 
  filter(season == "FALL") %>% 
  ggplot(aes(est_year, biomass)) +
  geom_line(aes(group = survdat_source, color = `survdat group`), alpha = 0.3) +
  geom_jitter(aes(group = survdat_source, color = `survdat group`, 
                  shape = survdat_source), size = 1.25,
              width = 0.15) +
  geom_mark_ellipse(data = mark1, 
                    aes(est_year, biomass, label = "Separation of the 2 groups"), 
                    linetype = 2) +
  scale_color_gmri() +
  facet_wrap(~season) +
  facet_zoom(xlim = c(2007:2013)) +
  labs(x = "", 
       y = "Total survdat$biomass", 
       title = "Data Mismatches Following Vessel Switch", 
       subtitle = "Inconsistency in Bigelow data over time.", 
       caption = paste0(species_choice, " only"),
       shape = "Pull Source",
       color = "Consistent Grouping") 

 

A work by Adam A. Kemberling

Akemberling@gmri.org